The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed. The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y).
# Reading datafile
bank.additional.full <- read_delim("../Data/bank-additional-full.csv", delim = ";")
## Parsed with column specification:
## cols(
## .default = col_character(),
## age = col_double(),
## duration = col_double(),
## campaign = col_double(),
## pdays = col_double(),
## previous = col_double(),
## emp.var.rate = col_double(),
## cons.price.idx = col_double(),
## cons.conf.idx = col_double(),
## euribor3m = col_double(),
## nr.employed = col_double()
## )
## See spec(...) for full column specifications.
#Missing data?
vis_miss(bank.additional.full)
## TRANSFORMING DATA (Suchi)
#Converting the categorical variables as factors
bank.additional.full$y <- as.factor(bank.additional.full$y)
bank.additional.full$job <- as.factor(bank.additional.full$job)
bank.additional.full$marital <- as.factor(bank.additional.full$marital)
bank.additional.full$education <- as.factor(bank.additional.full$education)
bank.additional.full$default <- as.factor(bank.additional.full$default)
bank.additional.full$housing <- as.factor(bank.additional.full$housing)
bank.additional.full$loan <- as.factor(bank.additional.full$loan)
bank.additional.full$contact <- as.factor(bank.additional.full$contact)
bank.additional.full$day_of_week <- as.factor(bank.additional.full$day_of_week)
bank.additional.full$poutcome <- as.factor(bank.additional.full$poutcome)
bank.additional.full$month <- as.factor(bank.additional.full$month)
#Removing records from bank.additional.full where value of default = 'yes' as it might cause issue when picked in non-balance way.
bank.additional.full <- bank.additional.full %>% filter (default!='yes')
# Separating out yes and no observations
bank.additional.no <- bank.additional.full %>% filter (y=='no')
bank.additional.yes <- bank.additional.full %>% filter (y=='yes')
# Picking 1000 sample each from yes and no
set.seed(1234)
index.no<-sample(1:nrow(bank.additional.no),2000,replace=FALSE)
index.yes<-sample(1:nrow(bank.additional.yes),2000,replace=FALSE)
bank.additional.sample.no<-bank.additional.no[index.no,]
bank.additional.sample.yes<-bank.additional.yes[index.yes,]
bank.additional.sample <- rbind(bank.additional.sample.no,bank.additional.sample.yes)
#Splitting train and test data set
set.seed(1234)
index<-sample(1:4000,3000,replace=FALSE)
bank.additional.sample <- as.data.frame(bank.additional.sample)
bank.additional.sample.train<-bank.additional.sample[index,]
bank.additional.sample.train <- as.data.frame(bank.additional.sample.train)
bank.additional.sample.test <-bank.additional.sample[-index,]
bank.additional.sample.test <- as.data.frame(bank.additional.sample.test)
#summary of dataset (Hollie)
summary(bank.additional.full)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24925
## Median :38.00 technician : 6741 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6155
## education default housing loan
## university.degree :12168 no :32588 no :18620 no :33947
## high.school : 9514 unknown: 8597 unknown: 990 unknown: 990
## basic.9y : 6045 yes : 0 yes :21575 yes : 6248
## professional.course: 5241
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## contact month day_of_week duration
## cellular :26141 may :13769 fri:7827 Min. : 0.0
## telephone:15044 jul : 7174 mon:8514 1st Qu.: 102.0
## aug : 6176 thu:8623 Median : 180.0
## jun : 5318 tue:8087 Mean : 258.3
## nov : 4100 wed:8134 3rd Qu.: 319.0
## apr : 2632 Max. :4918.0
## (Other): 2016
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.000 failure : 4251
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000 nonexistent:35561
## Median : 2.000 Median :999.0 Median :0.000 success : 1373
## Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :56.000 Max. :999.0 Max. :7.000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.8 Min. :0.634
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344
## Median : 1.10000 Median :93.75 Median :-41.8 Median :4.857
## Mean : 0.08183 Mean :93.58 Mean :-40.5 Mean :3.621
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.9 Max. :5.045
##
## nr.employed y
## Min. :4964 no :36545
## 1st Qu.:5099 yes: 4640
## Median :5191
## Mean :5167
## 3rd Qu.:5228
## Max. :5228
##
summary(bank.additional.sample)
## age job marital education
## Min. :17.00 admin. :1038 divorced: 466 university.degree :1293
## 1st Qu.:32.00 blue-collar: 773 married :2327 high.school : 893
## Median :38.00 technician : 653 single :1199 professional.course: 511
## Mean :40.64 services : 335 unknown : 8 basic.9y : 496
## 3rd Qu.:48.00 management : 293 basic.4y : 418
## Max. :98.00 retired : 261 basic.6y : 202
## (Other) : 647 (Other) : 187
## default housing loan contact month
## no :3362 no :1754 no :3287 cellular :2862 may :1070
## unknown: 638 unknown: 100 unknown: 100 telephone:1138 jul : 634
## yes : 0 yes :2146 yes : 613 aug : 584
## jun : 499
## nov : 396
## apr : 348
## (Other): 469
## day_of_week duration campaign pdays previous
## fri:717 Min. : 5.0 Min. : 1.000 Min. : 0 Min. :0.0000
## mon:789 1st Qu.: 150.0 1st Qu.: 1.000 1st Qu.:999 1st Qu.:0.0000
## thu:846 Median : 271.0 Median : 2.000 Median :999 Median :0.0000
## tue:808 Mean : 397.9 Mean : 2.313 Mean :893 Mean :0.3147
## wed:840 3rd Qu.: 538.2 3rd Qu.: 3.000 3rd Qu.:999 3rd Qu.:0.0000
## Max. :3509.0 Max. :37.000 Max. :999 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 486 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:3121 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70
## success : 393 Median :-0.1000 Median :93.44 Median :-41.80
## Mean :-0.4606 Mean :93.49 Mean :-40.29
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :2000
## 1st Qu.:1.250 1st Qu.:5076 yes:2000
## Median :4.021 Median :5191
## Mean :2.996 Mean :5137
## 3rd Qu.:4.959 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
summary(bank.additional.sample.train)
## age job marital education
## Min. :17.00 admin. :781 divorced: 335 university.degree :970
## 1st Qu.:31.00 blue-collar:570 married :1737 high.school :685
## Median :38.00 technician :499 single : 921 professional.course:380
## Mean :40.61 services :249 unknown : 7 basic.9y :356
## 3rd Qu.:48.00 management :222 basic.4y :306
## Max. :98.00 retired :199 basic.6y :150
## (Other) :480 (Other) :153
## default housing loan contact month
## no :2520 no :1297 no :2467 cellular :2151 may :808
## unknown: 480 unknown: 75 unknown: 75 telephone: 849 jul :475
## yes : 0 yes :1628 yes : 458 aug :439
## jun :362
## nov :300
## apr :262
## (Other):354
## day_of_week duration campaign pdays previous
## fri:554 Min. : 5.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:570 1st Qu.: 147.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:642 Median : 268.5 Median : 2.000 Median :999.0 Median :0.0000
## tue:606 Mean : 391.9 Mean : 2.328 Mean :894.1 Mean :0.3187
## wed:628 3rd Qu.: 524.2 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :3509.0 Max. :37.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 366 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:2339 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70
## success : 295 Median :-0.1000 Median :93.44 Median :-41.80
## Mean :-0.4622 Mean :93.49 Mean :-40.27
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :1513
## 1st Qu.:1.244 1st Qu.:5076 yes:1487
## Median :4.076 Median :5191
## Mean :2.992 Mean :5136
## 3rd Qu.:4.959 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
#boxplot of response variable (y) versus all other continous variables (Suchi)
bank.additional.sample %>% ggplot(aes(x = y, y=age)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success by Age") + ylab("Age")
bank.additional.sample %>% ggplot(aes(x = y, y=duration)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success by duration") + ylab("duration")
bank.additional.sample %>% ggplot(aes(x = y, y=campaign)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success by campaign") + ylab("campaign")
bank.additional.sample %>% ggplot(aes(x = y, y=pdays)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success") + ylab("Days after last contact")
bank.additional.sample %>% ggplot(aes(x = y, y=previous)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("Number of contacts done before")
bank.additional.sample %>% ggplot(aes(x = y, y=emp.var.rate )) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("employment variation rate - quarterly indicator")
bank.additional.sample %>% ggplot(aes(x = y, y=cons.price.idx)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("consumer price index - monthly indicator")
bank.additional.sample %>% ggplot(aes(x = y, y=cons.conf.idx)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("consumer confidence index - monthly indicator")
bank.additional.sample %>% ggplot(aes(x = y, y=euribor3m)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success") + ylab("euribor 3 month rate - daily indicator")
bank.additional.sample %>% ggplot(aes(x = y, y=nr.employed)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("number of employees - quarterly indicator")
#Visualize Categorical variables (Suchi)
#Table of counts are helpful for categorical variables
attach(bank.additional.sample)
## The following object is masked from package:MASS:
##
## housing
ftable(addmargins(table(y,job)))
## job admin. blue-collar entrepreneur housemaid management retired self-employed services student technician unemployed unknown Sum
## y
## no 472 473 74 51 153 67 68 189 38 351 46 18 2000
## yes 566 300 56 45 140 194 69 146 100 302 64 18 2000
## Sum 1038 773 130 96 293 261 137 335 138 653 110 36 4000
ftable(addmargins(table(y,marital)))
## marital divorced married single unknown Sum
## y
## no 249 1196 552 3 2000
## yes 217 1131 647 5 2000
## Sum 466 2327 1199 8 4000
ftable(addmargins(table(y,education)))
## education basic.4y basic.6y basic.9y high.school illiterate professional.course university.degree unknown Sum
## y
## no 203 127 302 456 0 249 587 76 2000
## yes 215 75 194 437 3 262 706 108 2000
## Sum 418 202 496 893 3 511 1293 184 4000
ftable(addmargins(table(y,default)))
## default no unknown yes Sum
## y
## no 1566 434 0 2000
## yes 1796 204 0 2000
## Sum 3362 638 0 4000
ftable(addmargins(table(y,housing)))
## housing no unknown yes Sum
## y
## no 876 58 1066 2000
## yes 878 42 1080 2000
## Sum 1754 100 2146 4000
ftable(addmargins(table(y,loan)))
## loan no unknown yes Sum
## y
## no 1634 58 308 2000
## yes 1653 42 305 2000
## Sum 3287 100 613 4000
ftable(addmargins(table(y,contact)))
## contact cellular telephone Sum
## y
## no 1224 776 2000
## yes 1638 362 2000
## Sum 2862 1138 4000
ftable(addmargins(table(y,month)))
## month apr aug dec jul jun mar may nov oct sep Sum
## y
## no 114 300 6 365 268 20 686 205 23 13 2000
## yes 234 284 32 269 231 120 384 191 139 116 2000
## Sum 348 584 38 634 499 140 1070 396 162 129 4000
ftable(addmargins(table(y,day_of_week)))
## day_of_week fri mon thu tue wed Sum
## y
## no 367 450 397 366 420 2000
## yes 350 339 449 442 420 2000
## Sum 717 789 846 808 840 4000
ftable(addmargins(table(y,poutcome)))
## poutcome failure nonexistent success Sum
## y
## no 221 1755 24 2000
## yes 265 1366 369 2000
## Sum 486 3121 393 4000
#to get proportions that make sense
prop.table(table(y,job),2)
## job
## y admin. blue-collar entrepreneur housemaid management retired
## no 0.4547206 0.6119017 0.5692308 0.5312500 0.5221843 0.2567050
## yes 0.5452794 0.3880983 0.4307692 0.4687500 0.4778157 0.7432950
## job
## y self-employed services student technician unemployed unknown
## no 0.4963504 0.5641791 0.2753623 0.5375191 0.4181818 0.5000000
## yes 0.5036496 0.4358209 0.7246377 0.4624809 0.5818182 0.5000000
plot(y~job,col=c("red","blue"))
prop.table(table(y,marital),2)
## marital
## y divorced married single unknown
## no 0.5343348 0.5139665 0.4603837 0.3750000
## yes 0.4656652 0.4860335 0.5396163 0.6250000
plot(y~marital,col=c("red","blue"))
prop.table(table(y,education),2)
## education
## y basic.4y basic.6y basic.9y high.school illiterate professional.course
## no 0.4856459 0.6287129 0.6088710 0.5106383 0.0000000 0.4872798
## yes 0.5143541 0.3712871 0.3911290 0.4893617 1.0000000 0.5127202
## education
## y university.degree unknown
## no 0.4539830 0.4130435
## yes 0.5460170 0.5869565
plot(y~education,col=c("red","blue"))
prop.table(table(y,default),2)
## default
## y no unknown yes
## no 0.4657942 0.6802508
## yes 0.5342058 0.3197492
plot(y~default,col=c("red","blue"))
prop.table(table(y,housing),2)
## housing
## y no unknown yes
## no 0.4994299 0.5800000 0.4967381
## yes 0.5005701 0.4200000 0.5032619
plot(y~housing,col=c("red","blue"))
prop.table(table(y,loan),2)
## loan
## y no unknown yes
## no 0.4971098 0.5800000 0.5024470
## yes 0.5028902 0.4200000 0.4975530
plot(y~loan,col=c("red","blue"))
prop.table(table(y,contact),2)
## contact
## y cellular telephone
## no 0.4276730 0.6818981
## yes 0.5723270 0.3181019
plot(y~contact,col=c("red","blue"))
prop.table(table(y,month),2)
## month
## y apr aug dec jul jun mar may
## no 0.3275862 0.5136986 0.1578947 0.5757098 0.5370741 0.1428571 0.6411215
## yes 0.6724138 0.4863014 0.8421053 0.4242902 0.4629259 0.8571429 0.3588785
## month
## y nov oct sep
## no 0.5176768 0.1419753 0.1007752
## yes 0.4823232 0.8580247 0.8992248
plot(y~month,col=c("red","blue"))
prop.table(table(y,day_of_week),2)
## day_of_week
## y fri mon thu tue wed
## no 0.5118550 0.5703422 0.4692671 0.4529703 0.5000000
## yes 0.4881450 0.4296578 0.5307329 0.5470297 0.5000000
plot(y~day_of_week,col=c("red","blue"))
prop.table(table(y,poutcome),2)
## poutcome
## y failure nonexistent success
## no 0.4547325 0.5623198 0.0610687
## yes 0.5452675 0.4376802 0.9389313
plot(y~poutcome,col=c("red","blue"))
Drawing corr plot to check collinearity between continous variables. However, for few, was not very clear if there is linear relationship. Putting additional scatter plots for those.
#Logistics Regression Assumption Check (Suchi)
#Overall EDA to check collinearity between continous variables
ggpairs(bank.additional.full,columns=c(1,11:14,16:20),aes(colour=y))
# ggpairs(bank.additional.sample,columns=c(1,11:14,16:20),aes(colour=y))
ggplot(bank.additional.full, aes(x = emp.var.rate , y = cons.price.idx)) + geom_point() + xlab("employment variation rate - quarterly indicator") + ylab("consumer price index") + ggtitle("employment variation rate vs consumer price index")
ggplot(bank.additional.full, aes(x = emp.var.rate , y = cons.conf.idx)) + geom_point() + xlab("employment variation rate - quarterly indicator") + ylab("consumer confidence index") + ggtitle("employment variation rate vs consumer confidence index")
ggplot(bank.additional.full, aes(x = emp.var.rate , y = nr.employed)) + geom_point() + xlab("employment variation rate - quarterly indicator") + ylab("number of employees")+ ggtitle("employment variation rate vs number of employees")
ggplot(bank.additional.full, aes(x = cons.price.idx , y = cons.conf.idx)) + geom_point() + xlab("consumer price index") + ylab("consumer confidence index") + ggtitle("Relation between consumer price index and consumer confidence index")
ggplot(bank.additional.full, aes(x = cons.price.idx , y = nr.employed)) + geom_point() + xlab("consumer price index") + ylab("number of employees") + ggtitle("Relation between consumer price index and number of employees")
ggplot(bank.additional.full, aes(x = cons.conf.idx , y = nr.employed)) + geom_point() + xlab("consumer confidence index") + ylab("number of employees") + ggtitle("Relation between consumer confidence index and number of employees")
# Checking significance of individual variables in predicting outcome (Suchi)
summary(glm(y~age , family="binomial",data=bank.additional.sample.train)) ##From boxplot, it did not look significant but from glm, it does look OK.
##
## Call:
## glm(formula = y ~ age, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.355 -1.160 -1.100 1.199 1.276
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.381256 0.126289 -3.019 0.00254 **
## age 0.008963 0.002978 3.009 0.00262 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4149.5 on 2998 degrees of freedom
## AIC: 4153.5
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~job , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ job, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.622 -1.102 -1.002 1.206 1.363
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.13593 0.07173 1.895 0.058088 .
## jobblue-collar -0.56338 0.11175 -5.041 4.62e-07 ***
## jobentrepreneur -0.54140 0.21636 -2.502 0.012339 *
## jobhousemaid -0.10926 0.24184 -0.452 0.651415
## jobmanagement -0.33478 0.15278 -2.191 0.028433 *
## jobretired 0.85182 0.17482 4.873 1.10e-06 ***
## jobself-employed -0.20337 0.22392 -0.908 0.363751
## jobservices -0.33741 0.14620 -2.308 0.021001 *
## jobstudent 0.86622 0.22866 3.788 0.000152 ***
## jobtechnician -0.31678 0.11501 -2.754 0.005879 **
## jobunemployed 0.23876 0.23724 1.006 0.314213
## jobunknown -0.21004 0.39179 -0.536 0.591884
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4046.8 on 2988 degrees of freedom
## AIC: 4070.8
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~marital , family="binomial",data=bank.additional.sample.train)) ## Not at all
##
## Call:
## glm(formula = y ~ marital, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.302 -1.152 -1.095 1.203 1.262
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1977 0.1098 -1.800 0.0719 .
## maritalmarried 0.1366 0.1198 1.140 0.2543
## maritalsingle 0.3260 0.1281 2.544 0.0110 *
## maritalunknown 0.4853 0.7716 0.629 0.5293
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4150.1 on 2996 degrees of freedom
## AIC: 4158.1
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~education , family="binomial",data=bank.additional.sample.train)) ## Meh
##
## Call:
## glm(formula = y ~ education, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2933 -1.1774 -0.9448 1.1333 1.4294
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.10467 0.11449 0.914 0.360590
## educationbasic.6y -0.68003 0.20504 -3.317 0.000911 ***
## educationbasic.9y -0.57376 0.15803 -3.631 0.000283 ***
## educationhigh.school -0.16600 0.13767 -1.206 0.227886
## educationilliterate 12.46139 187.49090 0.066 0.947008
## educationprofessional.course -0.10467 0.15373 -0.681 0.495962
## educationuniversity.degree 0.06065 0.13138 0.462 0.644313
## educationunknown 0.16359 0.20064 0.815 0.414871
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4113.0 on 2992 degrees of freedom
## AIC: 4129
##
## Number of Fisher Scoring iterations: 11
summary(glm(y~default , family="binomial",data=bank.additional.sample.train)) ## Okay, will get back to you!!!!
##
## Call:
## glm(formula = y ~ default, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2298 -1.2298 -0.8657 1.1259 1.5252
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.12237 0.03992 3.066 0.00217 **
## defaultunknown -0.91083 0.10626 -8.572 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4080.3 on 2998 degrees of freedom
## AIC: 4084.3
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~housing , family="binomial",data=bank.additional.sample.train)) ## Not at all
##
## Call:
## glm(formula = y ~ housing, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.176 -1.176 -1.033 1.178 1.329
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.01696 0.05554 -0.305 0.760
## housingunknown -0.33324 0.24097 -1.383 0.167
## housingyes 0.01451 0.07444 0.195 0.845
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4156.5 on 2997 degrees of freedom
## AIC: 4162.5
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~loan , family="binomial",data=bank.additional.sample.train)) ## Not at all
##
## Call:
## glm(formula = y ~ loan, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.175 -1.175 -1.033 1.180 1.329
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.005675 0.040267 -0.141 0.888
## loanunknown -0.344527 0.237912 -1.448 0.148
## loanyes -0.020527 0.101767 -0.202 0.840
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4156.5 on 2997 degrees of freedom
## AIC: 4162.5
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~contact , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ contact, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.295 -1.295 -0.867 1.064 1.524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.27224 0.04352 6.255 3.97e-10 ***
## contacttelephone -1.05693 0.08584 -12.312 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3998.1 on 2998 degrees of freedom
## AIC: 4002.1
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~month , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ month, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1710 -1.0212 -0.9368 1.2247 1.4388
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7510 0.1324 5.673 1.40e-08 ***
## monthaug -0.8285 0.1632 -5.075 3.87e-07 ***
## monthdec 0.7531 0.4703 1.601 0.109337
## monthjul -1.1302 0.1620 -6.976 3.05e-12 ***
## monthjun -0.8616 0.1691 -5.094 3.50e-07 ***
## monthmar 1.0408 0.3087 3.371 0.000748 ***
## monthmay -1.3472 0.1514 -8.898 < 2e-16 ***
## monthnov -0.8711 0.1758 -4.955 7.22e-07 ***
## monthoct 0.9934 0.2877 3.453 0.000555 ***
## monthsep 1.5062 0.3745 4.022 5.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3824.0 on 2990 degrees of freedom
## AIC: 3844
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~day_of_week , family="binomial",data=bank.additional.sample.train))## Not at all
##
## Call:
## glm(formula = y ~ day_of_week, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.257 -1.159 -1.031 1.196 1.331
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.050552 0.084999 -0.595 0.5520
## day_of_weekmon -0.303993 0.120269 -2.528 0.0115 *
## day_of_weekthu 0.194100 0.116136 1.671 0.0947 .
## day_of_weektue 0.235900 0.117823 2.002 0.0453 *
## day_of_weekwed 0.005959 0.116608 0.051 0.9592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4132.1 on 2995 degrees of freedom
## AIC: 4142.1
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~duration , family="binomial",data=bank.additional.sample.train)) ## This is not a true predictor
##
## Call:
## glm(formula = y ~ duration, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4693 -0.8492 -0.6090 0.9577 1.7715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6334204 0.0758929 -21.52 <2e-16 ***
## duration 0.0047279 0.0002114 22.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3263.2 on 2998 degrees of freedom
## AIC: 3267.2
##
## Number of Fisher Scoring iterations: 5
summary(glm(y~campaign , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ campaign, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2313 -1.1831 -0.6014 1.1244 2.2098
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.23844 0.05485 4.347 1.38e-05 ***
## campaign -0.11257 0.01854 -6.071 1.27e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4114.0 on 2998 degrees of freedom
## AIC: 4118
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~pdays , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ pdays, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.377 -1.082 -1.082 1.276 1.276
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7706092 0.2380557 11.64 <2e-16 ***
## pdays -0.0030020 0.0002415 -12.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3828.4 on 2998 degrees of freedom
## AIC: 3832.4
##
## Number of Fisher Scoring iterations: 5
summary(glm(y~previous , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ previous, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.332 -1.060 -1.060 1.300 1.300
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.28364 0.04116 -6.891 5.55e-12 ***
## previous 0.97806 0.07613 12.848 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3926.0 on 2998 degrees of freedom
## AIC: 3930
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~poutcome , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ poutcome, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.389 -1.060 -1.060 1.299 1.299
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2195 0.1052 2.087 0.0369 *
## poutcomenonexistent -0.5009 0.1132 -4.427 9.58e-06 ***
## poutcomesuccess 2.5750 0.2711 9.499 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3829.7 on 2997 degrees of freedom
## AIC: 3835.7
##
## Number of Fisher Scoring iterations: 5
summary(glm(y~emp.var.rate , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ emp.var.rate, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9118 -0.8277 -0.7697 0.8810 1.6500
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.27253 0.04198 -6.492 8.49e-11 ***
## emp.var.rate -0.56606 0.02493 -22.710 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3557.8 on 2998 degrees of freedom
## AIC: 3561.8
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~cons.price.idx , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ cons.price.idx, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5416 -1.0536 -0.9142 1.1056 1.5547
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 61.13741 5.65717 10.81 <2e-16 ***
## cons.price.idx -0.65414 0.06051 -10.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4036.8 on 2998 degrees of freedom
## AIC: 4040.8
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~cons.conf.idx , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ cons.conf.idx, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.363 -1.149 -1.037 1.185 1.336
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.317221 0.279983 4.705 2.54e-06 ***
## cons.conf.idx 0.033144 0.006893 4.809 1.52e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 4135.3 on 2998 degrees of freedom
## AIC: 4139.3
##
## Number of Fisher Scoring iterations: 3
summary(glm(y~euribor3m , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ euribor3m, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7086 -0.7992 -0.7800 0.8364 1.6539
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.52188 0.07714 19.73 <2e-16 ***
## euribor3m -0.51450 0.02183 -23.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3531.1 on 2998 degrees of freedom
## AIC: 3535.1
##
## Number of Fisher Scoring iterations: 4
summary(glm(y~nr.employed , family="binomial",data=bank.additional.sample.train))
##
## Call:
## glm(formula = y ~ nr.employed, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1351 -0.8903 -0.7504 0.9801 1.6762
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 63.9997285 2.7051431 23.66 <2e-16 ***
## nr.employed -0.0124563 0.0005258 -23.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3455.0 on 2998 degrees of freedom
## AIC: 3459
##
## Number of Fisher Scoring iterations: 4
#Suchi
#Type of Selection (Manual / Intuition)
#fitting a simple model by using only those predictors which appeared to be significant in EDA and individual variable check -- ##Kind of Manual backward selection
simple.logistics.all.significant<-glm(y~ job + contact + month + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all.significant)
##
## Call:
## glm(formula = y ~ job + contact + month + campaign + pdays +
## previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## euribor3m + nr.employed, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7513 -0.8615 -0.6019 0.8025 1.9204
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.320e+02 8.705e+01 -3.814 0.000137 ***
## jobblue-collar -3.822e-03 1.307e-01 -0.029 0.976663
## jobentrepreneur 3.588e-02 2.469e-01 0.145 0.884460
## jobhousemaid 7.902e-02 2.861e-01 0.276 0.782418
## jobmanagement -8.257e-02 1.811e-01 -0.456 0.648444
## jobretired 1.159e-01 2.127e-01 0.545 0.585903
## jobself-employed 1.419e-01 2.606e-01 0.545 0.586016
## jobservices 1.021e-01 1.678e-01 0.608 0.543096
## jobstudent 1.204e-01 2.710e-01 0.444 0.656800
## jobtechnician -1.055e-01 1.371e-01 -0.770 0.441590
## jobunemployed 2.671e-01 2.839e-01 0.941 0.346835
## jobunknown -2.010e-01 4.852e-01 -0.414 0.678753
## contacttelephone -6.506e-01 1.684e-01 -3.864 0.000112 ***
## monthaug 7.814e-01 3.313e-01 2.358 0.018353 *
## monthdec -3.174e-01 5.342e-01 -0.594 0.552382
## monthjul 8.123e-02 2.197e-01 0.370 0.711543
## monthjun -1.077e+00 3.052e-01 -3.528 0.000418 ***
## monthmar 1.580e+00 3.764e-01 4.197 2.71e-05 ***
## monthmay -3.981e-01 1.880e-01 -2.117 0.034230 *
## monthnov -3.713e-01 2.658e-01 -1.397 0.162413
## monthoct 1.584e-01 3.669e-01 0.432 0.666055
## monthsep 9.180e-01 4.913e-01 1.869 0.061670 .
## campaign -2.066e-02 1.842e-02 -1.122 0.262045
## pdays -1.036e-03 8.363e-04 -1.239 0.215338
## previous 8.733e-02 2.103e-01 0.415 0.677964
## poutcomenonexistent 5.814e-01 2.758e-01 2.108 0.035037 *
## poutcomesuccess 8.731e-01 8.334e-01 1.048 0.294771
## emp.var.rate -1.964e+00 3.420e-01 -5.743 9.29e-09 ***
## cons.price.idx 2.726e+00 5.864e-01 4.649 3.33e-06 ***
## cons.conf.idx 5.600e-02 2.076e-02 2.698 0.006974 **
## euribor3m 9.084e-02 3.059e-01 0.297 0.766502
## nr.employed 1.538e-02 6.945e-03 2.214 0.026835 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3210.2 on 2968 degrees of freedom
## AIC: 3274.2
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all.significant)
#Removing job variable as it does not look statistically significant in model (Got lowest AIC 3270 here)
simple.logistics.all1 <-glm(y~ contact + month + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all1)
##
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + previous +
## poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## euribor3m + nr.employed, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7664 -0.8295 -0.6153 0.8030 1.9294
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.389e+02 8.668e+01 -3.910 9.23e-05 ***
## contacttelephone -6.573e-01 1.679e-01 -3.914 9.06e-05 ***
## monthaug 7.622e-01 3.298e-01 2.311 0.020811 *
## monthdec -3.211e-01 5.328e-01 -0.603 0.546708
## monthjul 7.599e-02 2.188e-01 0.347 0.728324
## monthjun -1.105e+00 3.038e-01 -3.637 0.000276 ***
## monthmar 1.565e+00 3.750e-01 4.174 3.00e-05 ***
## monthmay -4.078e-01 1.864e-01 -2.187 0.028734 *
## monthnov -3.809e-01 2.652e-01 -1.436 0.151034
## monthoct 1.435e-01 3.659e-01 0.392 0.695019
## monthsep 9.011e-01 4.901e-01 1.839 0.065967 .
## campaign -2.071e-02 1.836e-02 -1.128 0.259254
## pdays -1.024e-03 8.350e-04 -1.226 0.220035
## previous 8.477e-02 2.094e-01 0.405 0.685613
## poutcomenonexistent 5.872e-01 2.750e-01 2.135 0.032753 *
## poutcomesuccess 8.967e-01 8.324e-01 1.077 0.281411
## emp.var.rate -1.988e+00 3.409e-01 -5.832 5.46e-09 ***
## cons.price.idx 2.775e+00 5.839e-01 4.752 2.01e-06 ***
## cons.conf.idx 5.793e-02 2.068e-02 2.802 0.005082 **
## euribor3m 7.810e-02 3.047e-01 0.256 0.797719
## nr.employed 1.586e-02 6.913e-03 2.294 0.021786 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3214.1 on 2979 degrees of freedom
## AIC: 3256.1
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all1)
#Removing month variable as it does not look practically significant
simple.logistics.all2 <-glm(y~ contact + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all2)
##
## Call:
## glm(formula = y ~ contact + campaign + pdays + previous + poutcome +
## emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m +
## nr.employed, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6757 -0.8045 -0.5392 0.9808 2.0543
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.014e+02 3.388e+01 -2.994 0.00276 **
## contacttelephone -8.871e-01 1.277e-01 -6.946 3.76e-12 ***
## campaign -1.367e-02 1.800e-02 -0.760 0.44742
## pdays -9.384e-04 8.256e-04 -1.137 0.25571
## previous 1.421e-01 2.081e-01 0.683 0.49478
## poutcomenonexistent 8.014e-01 2.708e-01 2.960 0.00308 **
## poutcomesuccess 1.015e+00 8.271e-01 1.227 0.21983
## emp.var.rate -9.492e-01 1.573e-01 -6.036 1.58e-09 ***
## cons.price.idx 1.157e+00 2.250e-01 5.143 2.71e-07 ***
## cons.conf.idx 5.223e-02 1.324e-02 3.944 8.03e-05 ***
## euribor3m 1.930e-01 1.776e-01 1.086 0.27732
## nr.employed -1.009e-03 3.149e-03 -0.320 0.74875
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3295.0 on 2988 degrees of freedom
## AIC: 3319
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all2)
#Removing campaign variable as it does not look statistically significant in model
simple.logistics.all3 <-glm(y~ contact + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all3)
##
## Call:
## glm(formula = y ~ contact + pdays + previous + poutcome + emp.var.rate +
## cons.price.idx + cons.conf.idx + euribor3m + nr.employed,
## family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6760 -0.8024 -0.5763 0.9853 2.0259
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.005e+02 3.387e+01 -2.967 0.00301 **
## contacttelephone -8.912e-01 1.276e-01 -6.987 2.81e-12 ***
## pdays -9.409e-04 8.257e-04 -1.139 0.25451
## previous 1.418e-01 2.080e-01 0.682 0.49523
## poutcomenonexistent 8.011e-01 2.707e-01 2.960 0.00308 **
## poutcomesuccess 1.013e+00 8.272e-01 1.225 0.22061
## emp.var.rate -9.590e-01 1.568e-01 -6.116 9.61e-10 ***
## cons.price.idx 1.155e+00 2.250e-01 5.133 2.85e-07 ***
## cons.conf.idx 5.187e-02 1.324e-02 3.918 8.94e-05 ***
## euribor3m 2.060e-01 1.769e-01 1.164 0.24432
## nr.employed -1.171e-03 3.143e-03 -0.373 0.70948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3295.5 on 2989 degrees of freedom
## AIC: 3317.5
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all3)
#Removing pdays variable as it does not look statistically significant in model
simple.logistics.all4 <-glm(y~ contact + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all4)
##
## Call:
## glm(formula = y ~ contact + previous + poutcome + emp.var.rate +
## cons.price.idx + cons.conf.idx + euribor3m + nr.employed,
## family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6590 -0.8059 -0.5752 0.9858 2.0284
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.015e+02 3.387e+01 -2.998 0.00272 **
## contacttelephone -8.951e-01 1.276e-01 -7.018 2.25e-12 ***
## previous 2.466e-01 1.948e-01 1.266 0.20554
## poutcomenonexistent 8.970e-01 2.637e-01 3.401 0.00067 ***
## poutcomesuccess 1.885e+00 2.858e-01 6.595 4.25e-11 ***
## emp.var.rate -9.647e-01 1.566e-01 -6.159 7.30e-10 ***
## cons.price.idx 1.160e+00 2.250e-01 5.157 2.51e-07 ***
## cons.conf.idx 5.198e-02 1.325e-02 3.924 8.71e-05 ***
## euribor3m 2.136e-01 1.769e-01 1.208 0.22708
## nr.employed -1.271e-03 3.145e-03 -0.404 0.68598
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3297.1 on 2990 degrees of freedom
## AIC: 3317.1
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all4)
#Removing previous variable as it does not look statistically significant in model
simple.logistics.all5 <-glm(y~ contact + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all5)
##
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m + nr.employed, family = "binomial",
## data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7025 -0.8173 -0.5716 0.9871 2.0373
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.009e+02 3.388e+01 -2.979 0.00289 **
## contacttelephone -9.103e-01 1.272e-01 -7.154 8.42e-13 ***
## poutcomenonexistent 6.087e-01 1.361e-01 4.474 7.67e-06 ***
## poutcomesuccess 1.942e+00 2.828e-01 6.866 6.58e-12 ***
## emp.var.rate -9.652e-01 1.568e-01 -6.157 7.40e-10 ***
## cons.price.idx 1.182e+00 2.247e-01 5.258 1.46e-07 ***
## cons.conf.idx 5.246e-02 1.323e-02 3.965 7.35e-05 ***
## euribor3m 2.263e-01 1.765e-01 1.282 0.19992
## nr.employed -1.723e-03 3.128e-03 -0.551 0.58182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3298.8 on 2991 degrees of freedom
## AIC: 3316.8
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all5)
#Removing nr.employed variable as it does not look statistically significant in model and it looked like related to emp.var.rate
simple.logistics.all6 <-glm(y~ contact + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.all6)
##
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7033 -0.8110 -0.5697 0.9841 2.0445
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.178e+02 1.465e+01 -8.039 9.08e-16 ***
## contacttelephone -9.235e-01 1.250e-01 -7.386 1.51e-13 ***
## poutcomenonexistent 6.076e-01 1.361e-01 4.465 8.01e-06 ***
## poutcomesuccess 1.948e+00 2.826e-01 6.893 5.44e-12 ***
## emp.var.rate -9.857e-01 1.526e-01 -6.461 1.04e-10 ***
## cons.price.idx 1.271e+00 1.545e-01 8.228 < 2e-16 ***
## cons.conf.idx 5.754e-02 9.497e-03 6.059 1.37e-09 ***
## euribor3m 1.537e-01 1.178e-01 1.304 0.192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3299.1 on 2992 degrees of freedom
## AIC: 3315.1
##
## Number of Fisher Scoring iterations: 5
#confint(simple.logistics.all6)
#Removing euribor3m variable as it does not look statistically significant in model and it looked like related to emp.var.rate
simple.logistics<-glm(y~ contact + poutcome + emp.var.rate + cons.conf.idx + cons.price.idx , family="binomial",data=bank.additional.sample.train)
summary(simple.logistics)
##
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.conf.idx +
## cons.price.idx, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6698 -0.7981 -0.5786 0.9835 1.9998
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.068e+02 1.200e+01 -8.903 < 2e-16 ***
## contacttelephone -8.733e-01 1.189e-01 -7.346 2.04e-13 ***
## poutcomenonexistent 6.184e-01 1.360e-01 4.548 5.42e-06 ***
## poutcomesuccess 1.931e+00 2.822e-01 6.842 7.79e-12 ***
## emp.var.rate -7.967e-01 4.555e-02 -17.490 < 2e-16 ***
## cons.conf.idx 6.046e-02 9.209e-03 6.565 5.19e-11 ***
## cons.price.idx 1.161e+00 1.294e-01 8.979 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3300.8 on 2993 degrees of freedom
## AIC: 3314.8
##
## Number of Fisher Scoring iterations: 5
confint(simple.logistics)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -130.79487834 -83.71427806
## contacttelephone -1.10795893 -0.64171154
## poutcomenonexistent 0.35273669 0.88611246
## poutcomesuccess 1.40193296 2.51426260
## emp.var.rate -0.88739499 -0.70873877
## cons.conf.idx 0.04258315 0.07870058
## cons.price.idx 0.91230508 1.41981182
Since our goal is to predict the outcome, removing the duration variable from stepwise full model even though it looked significant in the full model. default is considered in stepwise model. However, we do not want to consider it because considering unknown status as predictor would be risky. And defaultno is not significant. Practically I do not believe customer response depends on which month they were contacted (could be just a coincidence). But for now keeping it in the model.
Finalizing the below equation as it had lowest AIC among Forward, Backward and Stepwise model. Backward and Stepwise has exactly same set of predictor variables.
y ~ age + default + contact + month + day_of_week + campaign + pdays + previous + emp.var.rate + cons.price.idx + cons.conf.idx
#Suchi
#Type of Selection : Stepwise, Forward, Backward
# First Fit the full model (Excluding duration variable)
full.log.model<-glm(y~.,family="binomial",data=bank.additional.sample.train[-11])
# Stepwise model
step.model <- stepAIC(full.log.model, direction = "both", trace = FALSE)
summary(step.model)
##
## Call:
## glm(formula = y ~ housing + contact + month + day_of_week + pdays +
## poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## nr.employed, family = "binomial", data = bank.additional.sample.train[-11])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7924 -0.8471 -0.5624 0.7967 1.9671
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.512e+02 7.392e+01 -4.752 2.02e-06 ***
## housingunknown -5.957e-01 2.981e-01 -1.999 0.045661 *
## housingyes -1.858e-01 8.868e-02 -2.095 0.036127 *
## contacttelephone -6.804e-01 1.681e-01 -4.047 5.19e-05 ***
## monthaug 7.722e-01 3.322e-01 2.325 0.020096 *
## monthdec -3.266e-01 5.235e-01 -0.624 0.532665
## monthjul 7.834e-02 2.195e-01 0.357 0.721181
## monthjun -1.116e+00 2.947e-01 -3.787 0.000153 ***
## monthmar 1.627e+00 3.570e-01 4.557 5.20e-06 ***
## monthmay -3.861e-01 1.865e-01 -2.071 0.038403 *
## monthnov -3.376e-01 2.171e-01 -1.555 0.119908
## monthoct 1.780e-01 3.188e-01 0.558 0.576521
## monthsep 9.149e-01 4.778e-01 1.915 0.055500 .
## day_of_weekmon -3.421e-01 1.424e-01 -2.403 0.016270 *
## day_of_weekthu 1.604e-01 1.378e-01 1.164 0.244428
## day_of_weektue 9.308e-02 1.414e-01 0.658 0.510471
## day_of_weekwed 7.870e-02 1.382e-01 0.569 0.569116
## pdays -1.159e-03 7.874e-04 -1.472 0.140889
## poutcomenonexistent 5.134e-01 1.440e-01 3.566 0.000363 ***
## poutcomesuccess 7.860e-01 8.078e-01 0.973 0.330524
## emp.var.rate -1.988e+00 3.377e-01 -5.888 3.91e-09 ***
## cons.price.idx 2.849e+00 5.387e-01 5.289 1.23e-07 ***
## cons.conf.idx 6.266e-02 1.349e-02 4.644 3.42e-06 ***
## nr.employed 1.704e-02 4.739e-03 3.594 0.000325 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3192.4 on 2976 degrees of freedom
## AIC: 3240.4
##
## Number of Fisher Scoring iterations: 5
AIC(step.model)
## [1] 3240.412
exp(cbind("Odds ratio" = coef(step.model), confint.default(step.model, level = 0.95)))
## Odds ratio 2.5 % 97.5 %
## (Intercept) 2.897335e-153 3.499620e-216 2.398704e-90
## housingunknown 5.511613e-01 3.072894e-01 9.885755e-01
## housingyes 8.304239e-01 6.979415e-01 9.880539e-01
## contacttelephone 5.064283e-01 3.642625e-01 7.040793e-01
## monthaug 2.164542e+00 1.128753e+00 4.150813e+00
## monthdec 7.213405e-01 2.585371e-01 2.012601e+00
## monthjul 1.081489e+00 7.033559e-01 1.662912e+00
## monthjun 3.276301e-01 1.838895e-01 5.837283e-01
## monthmar 5.087925e+00 2.527142e+00 1.024358e+01
## monthmay 6.796717e-01 4.715780e-01 9.795912e-01
## monthnov 7.134540e-01 4.661905e-01 1.091864e+00
## monthoct 1.194868e+00 6.396850e-01 2.231893e+00
## monthsep 2.496643e+00 9.787197e-01 6.368754e+00
## day_of_weekmon 7.102951e-01 5.373498e-01 9.389026e-01
## day_of_weekthu 1.173992e+00 8.961087e-01 1.538048e+00
## day_of_weektue 1.097545e+00 8.318350e-01 1.448129e+00
## day_of_weekwed 1.081880e+00 8.251246e-01 1.418531e+00
## pdays 9.988412e-01 9.973008e-01 1.000384e+00
## poutcomenonexistent 1.670974e+00 1.260140e+00 2.215751e+00
## poutcomesuccess 2.194611e+00 4.505936e-01 1.068882e+01
## emp.var.rate 1.369154e-01 7.063030e-02 2.654076e-01
## cons.price.idx 1.727765e+01 6.010540e+00 4.966561e+01
## cons.conf.idx 1.064666e+00 1.036878e+00 1.093198e+00
## nr.employed 1.017181e+00 1.007776e+00 1.026674e+00
vif(step.model)
## GVIF Df GVIF^(1/(2*Df))
## housing 1.026534 2 1.006568
## contact 3.067376 1 1.751393
## month 44.211482 9 1.234294
## day_of_week 1.050005 4 1.006118
## pdays 10.030446 1 3.167088
## poutcome 11.923293 2 1.858228
## emp.var.rate 163.142930 1 12.772742
## cons.price.idx 53.864403 1 7.339237
## cons.conf.idx 2.335350 1 1.528185
## nr.employed 73.248971 1 8.558561
# forward regression model
foward.model <- stepAIC(full.log.model, direction = "forward", trace = FALSE)
AIC(foward.model)
## [1] 3276.491
summary(foward.model)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## housing + loan + contact + month + day_of_week + campaign +
## pdays + previous + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m + nr.employed, family = "binomial",
## data = bank.additional.sample.train[-11])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7426 -0.8526 -0.5169 0.7791 1.9976
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.255e+02 8.812e+01 -3.694 0.000221 ***
## age -2.827e-03 5.364e-03 -0.527 0.598110
## jobblue-collar 1.936e-01 1.656e-01 1.169 0.242272
## jobentrepreneur 4.790e-02 2.562e-01 0.187 0.851692
## jobhousemaid 2.028e-01 3.088e-01 0.657 0.511378
## jobmanagement -1.241e-01 1.852e-01 -0.670 0.502542
## jobretired 3.352e-01 2.579e-01 1.300 0.193764
## jobself-employed 1.303e-01 2.675e-01 0.487 0.626233
## jobservices 2.295e-01 1.816e-01 1.264 0.206383
## jobstudent 1.812e-01 2.865e-01 0.632 0.527119
## jobtechnician -4.346e-02 1.541e-01 -0.282 0.777878
## jobunemployed 3.703e-01 2.931e-01 1.264 0.206389
## jobunknown -7.066e-02 4.999e-01 -0.141 0.887588
## maritalmarried 1.892e-01 1.449e-01 1.306 0.191436
## maritalsingle 1.085e-01 1.662e-01 0.653 0.513944
## maritalunknown 5.728e-01 9.439e-01 0.607 0.543907
## educationbasic.6y -1.761e-01 2.420e-01 -0.728 0.466658
## educationbasic.9y -1.378e-01 1.948e-01 -0.707 0.479377
## educationhigh.school 3.488e-02 1.945e-01 0.179 0.857658
## educationilliterate 1.298e+01 2.641e+02 0.049 0.960817
## educationprofessional.course 3.823e-02 2.175e-01 0.176 0.860467
## educationuniversity.degree 2.333e-01 1.981e-01 1.178 0.238819
## educationunknown 1.316e-01 2.586e-01 0.509 0.610940
## defaultunknown -1.383e-01 1.264e-01 -1.094 0.273960
## housingunknown -5.528e-01 3.018e-01 -1.832 0.066964 .
## housingyes -1.863e-01 8.947e-02 -2.082 0.037358 *
## loanunknown NA NA NA NA
## loanyes 3.657e-02 1.212e-01 0.302 0.762963
## contacttelephone -6.546e-01 1.696e-01 -3.859 0.000114 ***
## monthaug 7.271e-01 3.361e-01 2.164 0.030496 *
## monthdec -3.786e-01 5.391e-01 -0.702 0.482478
## monthjul 1.053e-01 2.226e-01 0.473 0.636164
## monthjun -1.046e+00 3.099e-01 -3.375 0.000738 ***
## monthmar 1.594e+00 3.797e-01 4.197 2.70e-05 ***
## monthmay -3.690e-01 1.911e-01 -1.931 0.053508 .
## monthnov -3.915e-01 2.687e-01 -1.457 0.145151
## monthoct 1.187e-01 3.706e-01 0.320 0.748747
## monthsep 8.660e-01 4.946e-01 1.751 0.079961 .
## day_of_weekmon -3.535e-01 1.434e-01 -2.466 0.013679 *
## day_of_weekthu 1.449e-01 1.390e-01 1.043 0.296987
## day_of_weektue 8.182e-02 1.429e-01 0.573 0.566843
## day_of_weekwed 6.741e-02 1.396e-01 0.483 0.629126
## campaign -1.860e-02 1.848e-02 -1.006 0.314334
## pdays -1.030e-03 8.419e-04 -1.224 0.221133
## previous 8.111e-02 2.106e-01 0.385 0.700062
## poutcomenonexistent 5.676e-01 2.768e-01 2.051 0.040277 *
## poutcomesuccess 8.656e-01 8.383e-01 1.032 0.301843
## emp.var.rate -1.943e+00 3.450e-01 -5.630 1.80e-08 ***
## cons.price.idx 2.682e+00 5.935e-01 4.519 6.21e-06 ***
## cons.conf.idx 5.701e-02 2.100e-02 2.715 0.006624 **
## euribor3m 9.952e-02 3.091e-01 0.322 0.747487
## nr.employed 1.491e-02 7.028e-03 2.122 0.033838 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3174.5 on 2949 degrees of freedom
## AIC: 3276.5
##
## Number of Fisher Scoring iterations: 12
# backward regression model
backward.model <- stepAIC(full.log.model, direction = "backward", trace = FALSE)
AIC(backward.model)
## [1] 3240.412
summary(backward.model)
##
## Call:
## glm(formula = y ~ housing + contact + month + day_of_week + pdays +
## poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## nr.employed, family = "binomial", data = bank.additional.sample.train[-11])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7924 -0.8471 -0.5624 0.7967 1.9671
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.512e+02 7.392e+01 -4.752 2.02e-06 ***
## housingunknown -5.957e-01 2.981e-01 -1.999 0.045661 *
## housingyes -1.858e-01 8.868e-02 -2.095 0.036127 *
## contacttelephone -6.804e-01 1.681e-01 -4.047 5.19e-05 ***
## monthaug 7.722e-01 3.322e-01 2.325 0.020096 *
## monthdec -3.266e-01 5.235e-01 -0.624 0.532665
## monthjul 7.834e-02 2.195e-01 0.357 0.721181
## monthjun -1.116e+00 2.947e-01 -3.787 0.000153 ***
## monthmar 1.627e+00 3.570e-01 4.557 5.20e-06 ***
## monthmay -3.861e-01 1.865e-01 -2.071 0.038403 *
## monthnov -3.376e-01 2.171e-01 -1.555 0.119908
## monthoct 1.780e-01 3.188e-01 0.558 0.576521
## monthsep 9.149e-01 4.778e-01 1.915 0.055500 .
## day_of_weekmon -3.421e-01 1.424e-01 -2.403 0.016270 *
## day_of_weekthu 1.604e-01 1.378e-01 1.164 0.244428
## day_of_weektue 9.308e-02 1.414e-01 0.658 0.510471
## day_of_weekwed 7.870e-02 1.382e-01 0.569 0.569116
## pdays -1.159e-03 7.874e-04 -1.472 0.140889
## poutcomenonexistent 5.134e-01 1.440e-01 3.566 0.000363 ***
## poutcomesuccess 7.860e-01 8.078e-01 0.973 0.330524
## emp.var.rate -1.988e+00 3.377e-01 -5.888 3.91e-09 ***
## cons.price.idx 2.849e+00 5.387e-01 5.289 1.23e-07 ***
## cons.conf.idx 6.266e-02 1.349e-02 4.644 3.42e-06 ***
## nr.employed 1.704e-02 4.739e-03 3.594 0.000325 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3192.4 on 2976 degrees of freedom
## AIC: 3240.4
##
## Number of Fisher Scoring iterations: 5
#fitting a simple model by using only 4 predictors which appeared to be significant in EDA and p-values from previous step (Hollie)
simple.logistics.trained<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx, family="binomial",data=bank.additional.sample.train)
summary(simple.logistics.trained)
##
## Call:
## glm(formula = y ~ euribor3m + cons.price.idx + poutcome + cons.conf.idx,
## family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5747 -0.8565 -0.6697 0.9747 1.8015
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -25.517362 8.533832 -2.990 0.00279 **
## euribor3m -0.581088 0.034205 -16.988 < 2e-16 ***
## cons.price.idx 0.309276 0.092618 3.339 0.00084 ***
## poutcomenonexistent 0.521974 0.133147 3.920 8.84e-05 ***
## poutcomesuccess 2.049307 0.279601 7.329 2.31e-13 ***
## cons.conf.idx 0.054159 0.008641 6.268 3.66e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3374.7 on 2994 degrees of freedom
## AIC: 3386.7
##
## Number of Fisher Scoring iterations: 5
confint(simple.logistics.trained)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -42.39522044 -8.92090766
## euribor3m -0.64907943 -0.51492579
## cons.price.idx 0.12923340 0.49253047
## poutcomenonexistent 0.26144451 0.78365901
## poutcomesuccess 1.52626861 2.62844159
## cons.conf.idx 0.03735675 0.07124337
# Performance comparison for simple logistic model (Kenny)
#simple.logistics.trained<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx, family="binomial",data=bank.additional.sample.train)
fit.simple.logistics.pred<-predict(step.model, newdata = bank.additional.sample.test, type = "response")
#Suchi (removing 1 record from test where factor default )
#fit.simple.logistics.pred<-predict(step.model, newdata = bank.additional.sample.test, type = "response")
cutoff<-0.5
class.simple.logistics<-factor(ifelse(fit.simple.logistics.pred>cutoff,"yes","no"),levels=c("no","yes"))
conf.simple.logistics<-confusionMatrix(table(class.simple.logistics,bank.additional.sample.test$y), positive = 'yes')
conf.simple.logistics
## Confusion Matrix and Statistics
##
##
## class.simple.logistics no yes
## no 411 208
## yes 76 305
##
## Accuracy : 0.716
## 95% CI : (0.6869, 0.7438)
## No Information Rate : 0.513
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4355
##
## Mcnemar's Test P-Value : 7.639e-15
##
## Sensitivity : 0.5945
## Specificity : 0.8439
## Pos Pred Value : 0.8005
## Neg Pred Value : 0.6640
## Prevalence : 0.5130
## Detection Rate : 0.3050
## Detection Prevalence : 0.3810
## Balanced Accuracy : 0.7192
##
## 'Positive' Class : yes
##
results.simple.logistics<-prediction(fit.simple.logistics.pred,bank.additional.sample.test$y,label.ordering=c("no","yes"))
roc.simple.logistics <- performance(results.simple.logistics, measure = "tpr", x.measure = "fpr")
plot(roc.simple.logistics,colorize = TRUE)
abline(a=0, b=1)
Removing duration variable here before trying LASSO to compare the performance with stepwise
# LASSO (Suchi)
dat.train.x <- model.matrix(y ~ .,bank.additional.sample.train[-11])
dat.train.y<-bank.additional.sample.train[,21]
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 54 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -78.261787018
## (Intercept) .
## age -0.001637056
## jobblue-collar 0.087205824
## jobentrepreneur .
## jobhousemaid 0.061354472
## jobmanagement -0.129059410
## jobretired 0.243481422
## jobself-employed 0.068456428
## jobservices 0.143912659
## jobstudent 0.150250085
## jobtechnician -0.065976214
## jobunemployed 0.267901150
## jobunknown .
## maritalmarried 0.113492457
## maritalsingle 0.029054379
## maritalunknown 0.293777788
## educationbasic.6y -0.150084551
## educationbasic.9y -0.117252962
## educationhigh.school .
## educationilliterate 2.737057683
## educationprofessional.course .
## educationuniversity.degree 0.158908707
## educationunknown 0.080193552
## defaultunknown -0.132160923
## defaultyes .
## housingunknown -0.406167726
## housingyes -0.169472352
## loanunknown -0.079018009
## loanyes 0.005044221
## contacttelephone -0.416196425
## monthaug .
## monthdec -0.504244424
## monthjul 0.076878962
## monthjun -0.297748659
## monthmar 1.000946604
## monthmay -0.631007335
## monthnov -0.420164077
## monthoct 0.149219311
## monthsep 0.101023272
## day_of_weekmon -0.373512545
## day_of_weekthu 0.087494087
## day_of_weektue 0.019468612
## day_of_weekwed 0.005810514
## campaign -0.019773486
## pdays -0.001213301
## previous .
## poutcomenonexistent 0.433379460
## poutcomesuccess 0.597784166
## emp.var.rate -0.725772662
## cons.price.idx 0.866413405
## cons.conf.idx 0.045285626
## euribor3m .
## nr.employed .
print("CV Error Rate:")
## [1] "CV Error Rate:"
cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
## [1] 0.261
#Optimal penalty
print("Penalty Value:")
## [1] "Penalty Value:"
cvfit$lambda.min
## [1] 0.001412619
#For final model predictions go ahead and refit lasso using entire
#data set
lasso.log.final<-glmnet(dat.train.x, dat.train.y, family = "binomial",lambda=cvfit$lambda.min)
dat.test.x<-model.matrix(y ~ .,bank.additional.sample.test[-11])
fit.pred.lasso <- predict(lasso.log.final, newx = dat.test.x, type = "response")
fit.pred.step<-predict(step.model,newdata=bank.additional.sample.test,type="response")
#confusion matrix for LASSO (Suchi)
#Using cutoff of 0.5 to make the classification.
cutoff<-0.5
class.lasso<-factor(ifelse(fit.pred.lasso>cutoff,"yes","no"),levels=c("no","yes"))
class.step<-factor(ifelse(fit.pred.step>cutoff,"yes","no"),levels=c("no","yes"))
#Confusion Matrix for Lasso
conf.lasso<-table(class.lasso,bank.additional.sample.test$y)
print("Confusion matrix for LASSO")
## [1] "Confusion matrix for LASSO"
conf.lasso
##
## class.lasso no yes
## no 406 204
## yes 81 309
conf.step<-table(class.step,bank.additional.sample.test$y)
print("Confusion matrix for Stepwise")
## [1] "Confusion matrix for Stepwise"
conf.step
##
## class.step no yes
## no 411 208
## yes 76 305
#Overall Accuracy of LASSO and Stepwise
print("Overall accuracy for LASSO and Stepwise respectively")
## [1] "Overall accuracy for LASSO and Stepwise respectively"
sum(diag(conf.lasso))/sum(conf.lasso)
## [1] 0.715
sum(diag(conf.step))/sum(conf.step)
## [1] 0.716
# ROC plot for LASSO (Suchi)
results.lasso<-prediction(fit.pred.lasso, bank.additional.sample.test$y,label.ordering=c("no","yes"))
roc.lasso <- performance(results.lasso, measure = "tpr", x.measure = "fpr")
plot(roc.lasso,colorize = TRUE)
abline(a=0, b= 1)
#Fitting a complex model using 4 variables and 1 interaction (Kenny)
complex.interaction<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx + cons.price.idx*cons.conf.idx, family="binomial",data=bank.additional.sample.train)
summary(complex.interaction)
##
## Call:
## glm(formula = y ~ euribor3m + cons.price.idx + poutcome + cons.conf.idx +
## cons.price.idx * cons.conf.idx, family = "binomial", data = bank.additional.sample.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7452 -0.8067 -0.6823 1.0055 1.7825
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 254.48918 56.44127 4.509 6.52e-06 ***
## euribor3m -0.55084 0.03446 -15.986 < 2e-16 ***
## cons.price.idx -2.70092 0.60665 -4.452 8.50e-06 ***
## poutcomenonexistent 0.53884 0.13494 3.993 6.52e-05 ***
## poutcomesuccess 1.98992 0.28157 7.067 1.58e-12 ***
## cons.conf.idx 6.91006 1.37533 5.024 5.05e-07 ***
## cons.price.idx:cons.conf.idx -0.07368 0.01478 -4.986 6.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.7 on 2999 degrees of freedom
## Residual deviance: 3347.4 on 2993 degrees of freedom
## AIC: 3361.4
##
## Number of Fisher Scoring iterations: 5
confint(complex.interaction)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 145.6665772 367.12728215
## euribor3m -0.6193081 -0.48416238
## cons.price.idx -3.9113653 -1.53106520
## poutcomenonexistent 0.2750174 0.80429988
## poutcomesuccess 1.4625421 2.57243656
## cons.conf.idx 4.2661557 9.66177624
## cons.price.idx:cons.conf.idx -0.1032418 -0.04526933
#Fitting a complex model adding 1 new variable by categorizing euribor3m (Suchi)
# bank.additional.sample.train$euribor3m_updated <- factor(ifelse(bank.additional.sample.train$euribor3m>median(bank.additional.sample.train$euribor3m),"High","Low"),levels=c("Low","High"))
# summary(glm(y~euribor3m_updated , family="binomial",data=bank.additional.sample.train))
# complex.logistics.grouping<-glm(y~ euribor3m_updated + age + contact + month + day_of_week + campaign + pdays + previous + emp.var.rate + cons.price.idx + cons.conf.idx , family="binomial",data=bank.additional.sample.train)
# summary(complex.logistics.grouping)
# confint(complex.logistics.grouping)
# Performance comparison for simple logistic model + interaction term (Kenny)
complex.interaction.trained<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx + cons.price.idx*cons.conf.idx, family="binomial",data=bank.additional.sample.train)
fit.complex.interaction.pred<-predict(complex.interaction.trained, newdata = bank.additional.sample.test, type = "response")
cutoff<-0.5
class.complex.interaction<-factor(ifelse(fit.complex.interaction.pred>cutoff,"yes","no"),levels=c("no","yes"))
# conf.complex.interaction<-table(class.complex.interaction,bank.additional.sample.test$y)
conf.complex.interaction<-confusionMatrix(table(class.complex.interaction,bank.additional.sample.test$y), positive = 'yes')
conf.complex.interaction
## Confusion Matrix and Statistics
##
##
## class.complex.interaction no yes
## no 377 197
## yes 110 316
##
## Accuracy : 0.693
## 95% CI : (0.6634, 0.7215)
## No Information Rate : 0.513
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3884
##
## Mcnemar's Test P-Value : 9.188e-07
##
## Sensitivity : 0.6160
## Specificity : 0.7741
## Pos Pred Value : 0.7418
## Neg Pred Value : 0.6568
## Prevalence : 0.5130
## Detection Rate : 0.3160
## Detection Prevalence : 0.4260
## Balanced Accuracy : 0.6951
##
## 'Positive' Class : yes
##
# mean(class.complex.interaction==bank.additional.sample.test$y)
results.complex.interaction<-prediction(fit.complex.interaction.pred,bank.additional.sample.test$y,label.ordering=c("no","yes"))
roc.complex.interaction = performance(results.complex.interaction, measure = "tpr", x.measure = "fpr")
plot(roc.complex.interaction,col = "blue")
plot(roc.simple.logistics,col = "red",add=TRUE)
plot(roc.lasso,col = "green",add=TRUE)
legend("bottomright",legend=c("Simple","Added Interaction", "LASSO"),col=c("blue","red","green"),lty=1,lwd=1)
abline(a=0, b=1)
#Objective 2 (Point 3) -- LDA on Sample data (Suchi: Objective 2: Point 3) # Did not use duration variable
bank.additional.lda <- lda(y ~ ., bank.additional.sample.train[c(1,10:14,16:21)])
bank.additional.lda.p <- predict(bank.additional.lda, bank.additional.sample.test)$class
table.lda <- table(bank.additional.lda.p, bank.additional.sample.test$y)
cm.lda = confusionMatrix(table.lda, positive = 'yes')
cm.lda
## Confusion Matrix and Statistics
##
##
## bank.additional.lda.p no yes
## no 404 108
## yes 83 405
##
## Accuracy : 0.809
## 95% CI : (0.7832, 0.8329)
## No Information Rate : 0.513
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6182
##
## Mcnemar's Test P-Value : 0.08246
##
## Sensitivity : 0.7895
## Specificity : 0.8296
## Pos Pred Value : 0.8299
## Neg Pred Value : 0.7891
## Prevalence : 0.5130
## Detection Rate : 0.4050
## Detection Prevalence : 0.4880
## Balanced Accuracy : 0.8095
##
## 'Positive' Class : yes
##
#Random Forest (Suchi: Optional) # Did not use duration variable
bank.rf<- randomForest(y~., data = bank.additional.sample.train[-11], importance=TRUE)
bank.rf
##
## Call:
## randomForest(formula = y ~ ., data = bank.additional.sample.train[-11], importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 25.53%
## Confusion matrix:
## no yes class.error
## no 1253 260 0.1718440
## yes 506 981 0.3402824
varImpPlot(bank.rf)
plot(bank.rf)
bank.rf.pred<- predict(bank.rf, bank.additional.sample.test)
##Confusion Matrix
table.rf <- table(bank.rf.pred,bank.additional.sample.test$y)
cm.rf = confusionMatrix(table.rf, positive = 'yes')
cm.rf
## Confusion Matrix and Statistics
##
##
## bank.rf.pred no yes
## no 409 199
## yes 78 314
##
## Accuracy : 0.723
## 95% CI : (0.6941, 0.7505)
## No Information Rate : 0.513
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4491
##
## Mcnemar's Test P-Value : 5.591e-13
##
## Sensitivity : 0.6121
## Specificity : 0.8398
## Pos Pred Value : 0.8010
## Neg Pred Value : 0.6727
## Prevalence : 0.5130
## Detection Rate : 0.3140
## Detection Prevalence : 0.3920
## Balanced Accuracy : 0.7260
##
## 'Positive' Class : yes
##
After running thru 50 iterations on same sample data set and running thru 100 iteration on different split dataset, we see that at K=32, the model has highest performance.
#KNN (Hollie)
set.seed(123)
splitPerc = .75
iterations = 100
numks = 50
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(50), k = numeric(50))
trainIndices = sample(1:dim(bank.additional.sample)[1],round(splitPerc * dim(bank.additional.sample)[1]))
train = bank.additional.sample[trainIndices,]
test = bank.additional.sample[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train[,c(17,19)],test[,c(17,19)],train$y, prob = TRUE, k = i)
table(classifications,test$y)
CM = confusionMatrix(table(classifications,test$y))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
plot(seq(1,numks,1),MeanAcc, type = "l")
## Loop for many k and one training / test partition
accs = data.frame(accuracy = numeric(50), k = numeric(50))
for(i in 1:50)
{
classifications = knn(bank.additional.sample.train[,c(17,19)],bank.additional.sample.test[,c(17,19)],bank.additional.sample.train$y, prob = TRUE, k = i)
table(bank.additional.sample.test$y,classifications)
CM = confusionMatrix(table(bank.additional.sample.test$y,classifications),positive = 'yes')
accs$accuracy[i] = CM$overall[1]
accs$k[i] = i
}
plot(accs$k,accs$accuracy, type = "l", xlab = "k")
# k = 10 (optimal that we found)
classifications = knn(bank.additional.sample.train[,c(17,20)],bank.additional.sample.test[,c(17,20)],bank.additional.sample.train$y, prob = TRUE, k = 10)
table(bank.additional.sample.test$y,classifications)
## classifications
## no yes
## no 416 71
## yes 225 288
confusionMatrix(table(bank.additional.sample.test$y,classifications) ,positive = 'yes')
## Confusion Matrix and Statistics
##
## classifications
## no yes
## no 416 71
## yes 225 288
##
## Accuracy : 0.704
## 95% CI : (0.6746, 0.7322)
## No Information Rate : 0.641
## P-Value [Acc > NIR] : 1.475e-05
##
## Kappa : 0.4123
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8022
## Specificity : 0.6490
## Pos Pred Value : 0.5614
## Neg Pred Value : 0.8542
## Prevalence : 0.3590
## Detection Rate : 0.2880
## Detection Prevalence : 0.5130
## Balanced Accuracy : 0.7256
##
## 'Positive' Class : yes
##
cm.knn = confusionMatrix(table(bank.additional.sample.test$y,classifications),positive = 'yes')
cm.knn$overall[1]
## Accuracy
## 0.704
#Suchi
#Comparing metrics from all model
print("Simple model Overall Accuracy:")
## [1] "Simple model Overall Accuracy:"
conf.simple.logistics$overall[1]
## Accuracy
## 0.716
print("Complex model Overall Accuracy:")
## [1] "Complex model Overall Accuracy:"
conf.complex.interaction$overall[1]
## Accuracy
## 0.693
print("Random Forest Overall Accuracy:")
## [1] "Random Forest Overall Accuracy:"
cm.rf$overall[1]
## Accuracy
## 0.723
print("LDA Overall Accuracy:")
## [1] "LDA Overall Accuracy:"
cm.lda$overall[1]
## Accuracy
## 0.809
print("KNN Overall Accuracy:")
## [1] "KNN Overall Accuracy:"
cm.knn$overall[1]
## Accuracy
## 0.704
print("Simple model Misclassification rate:")
## [1] "Simple model Misclassification rate:"
1-sum(diag(conf.simple.logistics$table))/sum(conf.simple.logistics$table)
## [1] 0.284
print("Complex model Misclassification rate:")
## [1] "Complex model Misclassification rate:"
1-sum(diag(conf.complex.interaction$table))/sum(conf.complex.interaction$table)
## [1] 0.307
print("RandomForest Misclassification rate:")
## [1] "RandomForest Misclassification rate:"
1-sum(diag(cm.rf$table))/sum(cm.rf$table)
## [1] 0.277
print("LDA Misclassification rate:")
## [1] "LDA Misclassification rate:"
1-sum(diag(cm.lda$table))/sum(cm.lda$table)
## [1] 0.191
print("KNN Misclassification rate:")
## [1] "KNN Misclassification rate:"
1-sum(diag(cm.knn$table))/sum(cm.knn$table)
## [1] 0.296